Import¶

In [1]:
import pandas as pd
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
import warnings
import itertools

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import het_breuschpagan

# Ignore all warnings
warnings.filterwarnings("ignore")
In [2]:
import arima_prophet_functions
Importing plotly failed. Interactive plots will not work.

Purpose of this notebook¶

Этот ноутбук показывает обучение лучших ARIMA и Prophet моделей для данных загрузки ОНМК центров Санкт-Петербурга.

Данные представляют собой временной ряд месячной загрузки центров с января 2017 по ноябрь 2024 года.

Оценка будет проводиться для суммарной загрузки центров в месяц, иными словами — для всех госпиталей сразу, а не по отдельности.

Data¶

Считываем данные и задаём основную колонку для анализа

In [3]:
# List the contents of the data directory (IPython shell escape)
!ls data

# Monthly data imputed with MICE (random forest), broken down by hospital
data_path_MR = "data/data_summarized_by_month_filled_hosp_MR.tsv"

# Tab-separated file
data_hosp_MR = pd.read_csv(data_path_MR, sep="\t")
F1.csv
F1.xlsx
data_summarized_by_month_filled.tsv
data_summarized_by_month_filled_hosp.tsv
data_summarized_by_month_filled_hosp_MR.tsv
In [4]:
main_column = 'Пролечены_с_ОНМК_Всего'

Data Preparation¶

In [5]:
# MICE RF with hospitals
data_hosp_MR_used = data_hosp_MR.copy()

# Drop the hospital excluded from the analysis
data_hosp_MR_used = data_hosp_MR_used.loc[~data_hosp_MR_used.hosp.isin(["med1"])]

# Build a proper datetime column (first day of each month) and a YYYY-MM
# label for plotting; the label is derived from 'Date' instead of re-running
# pd.to_datetime a second time as the original did.
data_hosp_MR_used.loc[:, 'Date'] = pd.to_datetime(data_hosp_MR_used[['year', 'month']].assign(Day=1))
data_hosp_MR_used.loc[:, 'Date_Graph'] = data_hosp_MR_used['Date'].dt.strftime('%Y-%m')

# Keep only the columns needed downstream
data_hosp_MR_used = data_hosp_MR_used.loc[:, [main_column, 'Date', 'Date_Graph', 'year', 'month']]

# Aggregate over hospitals: monthly totals across all centers
data_hosp_MR_used = data_hosp_MR_used.groupby(['Date', 'Date_Graph', 'year', 'month']).sum().reset_index()

# Keep everything up to and including October 2024.
# NOTE: the original cutoff "2024-10-31" kept the same rows only because all
# dates are month starts; "2024-11-01" states that intent explicitly.
data_hosp_MR_used = data_hosp_MR_used.loc[data_hosp_MR_used.Date < pd.Timestamp("2024-11-01")]

TS Checks¶

Compare TimeSeries¶

In [6]:
# NOTE(review): dead cell — this commented-out comparison plot references
# datasets (data_basic_used, data_hosp_used) that are not defined anywhere in
# this notebook; kept for reference, consider deleting.
# import matplotlib.pyplot as plt
# import numpy as np

# # Create the figure and axes
# fig, axes = plt.subplots(1, 1, figsize=(12, 12), sharex=True)
# axes = axes.flatten()

# # Define the datasets and titles
# datasets = [data_basic_used, data_hosp_used, data_hosp_MR_used]
# titles = ["All_Hospitals 2017-2024 (Basic)", 
#           "All_Hospitals 2017-2024 (Hospitals)", 
#           "All_Hospitals 2017-2024 (MR Hospitals)"]

# # Loop through axes and datasets
# for i, ax in enumerate(axes):
#     # Copy the data to avoid modifying the original
#     data_plot = datasets[i].copy()
    
#     # Plot the data
#     ax.plot(data_plot['Date_Graph'], data_plot[main_column], 
#             label="All_Hospitals", marker='o', color="gray")
    
#     # Set the title and labels
#     ax.set_title(titles[i], fontsize=14)
#     ax.set_ylabel("Hospital Load", fontsize=12)
    
#     # Set the xticks and labels
#     xticks = np.arange(len(data_plot['Date_Graph']))  # Index positions
#     skip_ticks = xticks[::2]  # Take every second tick
#     ax.set_xticks(skip_ticks)
#     ax.set_xticklabels(data_plot['Date_Graph'].iloc[skip_ticks], rotation=90)
    
#     # Add grid and legend
#     ax.grid(True, which='both', linestyle='--', linewidth=0.5)
#     ax.legend(fontsize=12)

# # Set shared x-axis label
# axes[-1].set_xlabel("Year-Month", fontsize=12)

# # Adjust layout and add a super title
# plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust for supertitle
# plt.suptitle("Hospital Load Over Time (2017-2024)", fontsize=16)

# # Show the plot
# plt.show()

Изучать наш временной ряд будем на отрезке 2017-2023 года. 2024 год будет отложен для тестирования модели.

In [7]:
# Work on a copy of the prepared data; 2024 is held out as the test period.
data_used_model = data_hosp_MR_used.copy()

# Everything before 2024-01-01 is training data, the rest is test data.
split_date = pd.Timestamp("2024-01-01")
in_train = data_used_model.Date < split_date
data_used_train = data_used_model.loc[in_train]
data_used_test = data_used_model.loc[~in_train]

Decompose¶

In [8]:
# Restrict to the 2017-2023 window and turn the load column into a Series
# with a monthly DatetimeIndex (seasonal_decompose needs a known frequency).
data_ts = data_hosp_MR_used.loc[data_hosp_MR_used.year.isin([2017, 2018, 2019, 2020, 2021, 2022, 2023])][[main_column, 'Date']]
data_ts = pd.Series(data_ts[main_column])
# Changes vs. original: periods=len(data_ts) instead of the hard-coded 84, and
# month-start "MS" instead of month-end "M" — the actual Date values are the
# 1st of each month, and "M" is a deprecated alias in recent pandas.  The
# decomposition values are unchanged; only the index labels align correctly.
data_ts.index = pd.date_range(start="2017-01-01", periods=len(data_ts), freq="MS")
In [9]:
# Decompose the time series
decomposition = seasonal_decompose(data_ts, model='additive')
seasonal = decomposition.seasonal
residual = decomposition.resid

# Calculate variances
residual_variance = np.var(residual.dropna())
seasonal_variance = np.var(seasonal.dropna())

# Calculate Seasonal Strength
seasonal_strength = 1 - (residual_variance / (residual_variance + seasonal_variance))

print(f"Seasonal Strength: {seasonal_strength:.3f}")
fig = decomposition.plot()  # `decomposition.plot()` returns a Figure
fig.set_size_inches(6, 6)  # Optional: Adjust the figure size

# Rotate x-axis tick labels for each subplot
for ax in fig.axes:
    for label in ax.get_xticklabels():
        label.set_rotation(45)

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()
Seasonal Strength: 0.306

Dickey-Fuller test¶

DF w/o differencing¶

In [10]:
from statsmodels.tsa.stattools import adfuller
In [11]:
# Index by date. dropna() already removes every missing row, so the original
# trailing .fillna(0) was a no-op and has been removed; set_index is chained
# instead of using inplace=True.
data_plot = data_hosp_MR_used.set_index('Date').dropna()

# Augmented Dickey-Fuller test on the raw (undifferenced) series
result = adfuller(data_plot[main_column])

# Display results (output format identical to before; the original wrapped a
# constant string in an f-string for no reason)
print('*** All_Hospitals ***')
print("Dickey-Fuller Test Results:")
print(f"Test Statistic: {result[0]}")
print(f"p-value: {result[1]}")
print(f"Number of Lags Used: {result[2]}")
print(f"Number of Observations Used: {result[3]}")
print("Critical Values:")
for key, value in result[4].items():
    print(f"   {key}: {value}")
*** All_Hospitals ***
Dickey-Fuller Test Results:
Test Statistic: -3.036366478552415
p-value: 0.031634988394189344
Number of Lags Used: 9
Number of Observations Used: 84
Critical Values:
   1%: -3.510711795769895
   5%: -2.8966159448223734
   10%: -2.5854823866213152

Видим стационарность, но слабую (p = 0.03). Попробуем продифференцировать ряд.

In [12]:
plt.plot(data_plot[main_column])
Out[12]:
[<matplotlib.lines.Line2D at 0x7f199b29c8b0>]

DF test with differencing¶

In [13]:
# Index by date. dropna() already removes every missing row, so the original
# trailing .fillna(0) was a no-op and has been removed.
data_plot = data_hosp_MR_used.set_index('Date').dropna()

# First-order differencing to remove the trend before re-testing
series = data_plot[main_column].diff(1).dropna()

# Augmented Dickey-Fuller test on the differenced series
result = adfuller(series)

# Display results (output format identical to before)
print('*** All_Hospitals ***')
print("Dickey-Fuller Test Results:")
print(f"Test Statistic: {result[0]}")
print(f"p-value: {result[1]}")
print(f"Number of Lags Used: {result[2]}")
print(f"Number of Observations Used: {result[3]}")
print("Critical Values:")
for key, value in result[4].items():
    print(f"   {key}: {value}")
*** All_Hospitals ***
Dickey-Fuller Test Results:
Test Statistic: -4.928434258839517
p-value: 3.0692049629707045e-05
Number of Lags Used: 5
Number of Observations Used: 87
Critical Values:
   1%: -3.5078527246648834
   5%: -2.895382030636155
   10%: -2.584823877658872
In [14]:
plt.plot(data_plot[main_column].diff(1).dropna())
Out[14]:
[<matplotlib.lines.Line2D at 0x7f1999340040>]

Оп, уже стационарность побольше! И ряд сам "выравнивается" - нет линейных трендов, что хорошо для ARIMA моделей. Поэтому будем использовать дифференцированный ряд.

PACF/ACF¶

In [15]:
# Select the Load time series
load_series = pd.Series(data_used_train[main_column]).diff(1).dropna()

# Plot ACF and PACF
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# ACF plot
plot_acf(load_series, ax=axes[0], lags=40)  # Adjust lags as needed
axes[0].set_title('Autocorrelation Function (ACF)')
axes[0].set_xticks(range(0, 40, 2))

# PACF plot
plot_pacf(load_series, ax=axes[1], lags=40, method='ywm')  # 'ywm' is robust for small samples
axes[1].set_title('Partial Autocorrelation Function (PACF)')
axes[1].set_xticks(range(0, 40, 2))

plt.tight_layout()
plt.show()

Что тут видим: ...

My Auto_Arima¶

Так как работа пакета "auto_arima" меня не устроила, я написал собственный грид-серч с использованием itertools.

Ниже будет показан тестовый прогон моего кода:

ARIMA¶

In [16]:
# Example usage
# Assuming `time_series_data` is your pd.Series object
results = arima_prophet_functions.grid_search_arima(
    data=data_used_train[main_column],
    min_p=0,
    max_p=5,
    min_q=0,
    max_q=5,
    d=1,
    min_P=0,
    max_P=0,
    min_Q=0,
    max_Q=0,
    min_D=0,
    max_D=0,
    seasonal_period=12,
    trends=["n"]
)
In [17]:
results.sort_values('AIC').head()
Out[17]:
p q d P Q D seasonal_period trend AIC
29 4 5 1 0 0 0 12 n 1045.354399
17 2 5 1 0 0 0 12 n 1047.165300
35 5 5 1 0 0 0 12 n 1047.412144
23 3 5 1 0 0 0 12 n 1047.771729
5 0 5 1 0 0 0 12 n 1054.054548

SARIMA¶

In [18]:
# Example usage
# Assuming `time_series_data` is your pd.Series object
results = arima_prophet_functions.grid_search_arima(
    data=data_used_train[main_column],
    min_p=0,
    max_p=5,
    min_q=0,
    max_q=5,
    d=1,
    min_P=1,
    max_P=1,
    min_Q=1,
    max_Q=1,
    min_D=0,
    max_D=0,
    seasonal_period=12,
    trends=["n"]
)
In [19]:
results.sort_values('AIC').head()
Out[19]:
p q d P Q D seasonal_period trend AIC
29 4 5 1 1 1 0 12 n 883.948428
17 2 5 1 1 1 0 12 n 884.057333
23 3 5 1 1 1 0 12 n 886.649834
35 5 5 1 1 1 0 12 n 887.223640
5 0 5 1 1 1 0 12 n 889.524956
In [20]:
sns.boxplot(data=results, y="AIC", x="q")
Out[20]:
<Axes: xlabel='q', ylabel='AIC'>

ARIMA models¶

Модуль обучения различных ARIMA моделей и поиска лучшей

Learn Models¶

ARIMA(1, 1, 1)¶

In [21]:
# Rebuild the train/test split by row position: the first 84 monthly rows
# cover 2017-2023, the remainder (2024) is the test set.
data_used_model = data_hosp_MR_used.copy()

train_month=84 #first 84 months = 2017-2023

data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
In [22]:
# Fit ARIMA(1,1,1) with no trend on the first `train_month` observations,
# assemble a frame with fitted/predicted values, and compute train/test metrics.
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(1, 1, 1), train_month=train_month, trend='n')
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())

metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
# Last expression: rich display of the metrics table
metrics_df
                                 SARIMAX Results                                  
==================================================================================
Dep. Variable:     Пролечены_с_ОНМК_Всего   No. Observations:                   84
Model:                   SARIMAX(1, 1, 1)   Log Likelihood                -554.497
Date:                    Thu, 30 Jan 2025   AIC                           1114.995
Time:                            19:51:15   BIC                           1122.178
Sample:                                 0   HQIC                          1117.877
                                     - 84                                         
Covariance Type:                      opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.2334      0.131     -1.779      0.075      -0.491       0.024
ma.L1         -0.8828      0.066    -13.475      0.000      -1.011      -0.754
sigma2      5.131e+04   9767.889      5.253      0.000    3.22e+04    7.05e+04
===================================================================================
Ljung-Box (L1) (Q):                   0.04   Jarque-Bera (JB):                 1.42
Prob(Q):                              0.85   Prob(JB):                         0.49
Heteroskedasticity (H):               0.96   Skew:                             0.17
Prob(H) (two-sided):                  0.91   Kurtosis:                         2.44
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
*** TRAIN ***
MAE: 214.14
MSE: 82432.94
RMSE: 287.11
MAPE: 12.82%
AIC: 1114.9945345742562

*** TEST ***
MAE: 217.73
MSE: 59746.17
RMSE: 244.43
MAPE: 15.10%
Out[22]:
Dataset TRAIN TEST
MAE 214.137296 217.734950
MSE 82432.935135 59746.172263
RMSE 287.111364 244.430301
MAPE (%) 12.819832 15.103578
AIC 1114.994535 NaN
In [23]:
arima_prophet_functions.model_performance(data_check_metrics, main_column)

Ljung-Box Test
      lb_stat  lb_pvalue
10  12.362577   0.261523
In [24]:
# Plot actuals vs. fitted values and the forecast with its confidence
# interval; the cutoff marks the end of the training period (Dec 2023).
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp", 
                         cutoff_date="12-30-2023", vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

Rolling Prediction¶

In [25]:
# Walk-forward (rolling) prediction over the 2024 test months: refit/update
# on months 84..94 one step at a time instead of a single static forecast.
train_month_start = 84
train_month_end = 94
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (1,1,1), 
                                                       train_month_start=train_month_start, 
                                                       train_month_end=train_month_end)
In [26]:
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN ***
MAE: 214.14
MSE: 82432.94
RMSE: 287.11
MAPE: 12.82%
AIC: 1114.9945345742562

*** TEST ***
MAE: 199.50
MSE: 45576.68
RMSE: 213.49
MAPE: 13.53%
Out[26]:
Dataset TRAIN TEST
MAE 214.137296 199.501572
MSE 82432.935135 45576.676463
RMSE 287.111364 213.486947
MAPE (%) 12.819832 13.531049
AIC 1114.994535 NaN
In [27]:
# Plot the rolling-forecast results (no CI columns passed here)
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         cutoff_date="12-30-2023", vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

ARIMA(2, 1, 3)¶

In [28]:
# NOTE(review): identical to the split cell in the ARIMA(1,1,1) section;
# repeated so each model section is self-contained.
data_used_model = data_hosp_MR_used.copy()

train_month=84 #first 84 months = 2017-2023

data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
In [29]:
# Fit ARIMA(2,1,3) with no trend on the training window and compute metrics
# for comparison against ARIMA(1,1,1).
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), train_month=train_month, trend='n')
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())

metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
# Last expression: rich display of the metrics table
metrics_df
                                 SARIMAX Results                                  
==================================================================================
Dep. Variable:     Пролечены_с_ОНМК_Всего   No. Observations:                   84
Model:                   SARIMAX(2, 1, 3)   Log Likelihood                -529.077
Date:                    Thu, 30 Jan 2025   AIC                           1070.153
Time:                            19:51:17   BIC                           1084.370
Sample:                                 0   HQIC                          1075.849
                                     - 84                                         
Covariance Type:                      opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -1.1369      0.017    -65.637      0.000      -1.171      -1.103
ar.L2         -0.9950      0.019    -51.576      0.000      -1.033      -0.957
ma.L1          0.3204      0.139      2.309      0.021       0.048       0.592
ma.L2         -0.1072      0.123     -0.874      0.382      -0.348       0.133
ma.L3         -0.9042      0.200     -4.515      0.000      -1.297      -0.512
sigma2      3.546e+04   8.05e-06   4.41e+09      0.000    3.55e+04    3.55e+04
===================================================================================
Ljung-Box (L1) (Q):                   0.32   Jarque-Bera (JB):                 3.66
Prob(Q):                              0.57   Prob(JB):                         0.16
Heteroskedasticity (H):               0.80   Skew:                            -0.32
Prob(H) (two-sided):                  0.58   Kurtosis:                         2.16
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.09e+25. Standard errors may be unstable.
*** TRAIN ***
MAE: 203.51
MSE: 98045.29
RMSE: 313.12
MAPE: 12.13%
AIC: 1070.153429480983

*** TEST ***
MAE: 147.26
MSE: 32045.75
RMSE: 179.01
MAPE: 10.00%
Out[29]:
Dataset TRAIN TEST
MAE 203.510432 147.257832
MSE 98045.286626 32045.747262
RMSE 313.121840 179.013260
MAPE (%) 12.129705 10.000349
AIC 1070.153429 NaN
In [30]:
arima_prophet_functions.model_performance(data_check_metrics, main_column)

Ljung-Box Test
      lb_stat  lb_pvalue
10  20.968363   0.021315
In [31]:
# Plot actuals vs. fitted values and the forecast with confidence intervals
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp", 
                         cutoff_date="12-30-2023", vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

Rolling Prediction¶

In [32]:
# Walk-forward prediction for ARIMA(2,1,3); reuses train_month_start/_end
# (84/94) defined in the ARIMA(1,1,1) section above.
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3), 
                                                       train_month_start=train_month_start, 
                                                       train_month_end=train_month_end)
In [33]:
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN ***
MAE: 203.51
MSE: 98045.29
RMSE: 313.12
MAPE: 12.13%
AIC: 1070.153429480983

*** TEST ***
MAE: 121.60
MSE: 24424.27
RMSE: 156.28
MAPE: 8.16%
Out[33]:
Dataset TRAIN TEST
MAE 203.510432 121.601240
MSE 98045.286626 24424.271559
RMSE 313.121840 156.282666
MAPE (%) 12.129705 8.158376
AIC 1070.153429 NaN
In [34]:
final_data_check_metrics
Out[34]:
Date Date_Graph Пролечены_с_ОНМК_Всего Пролечены_с_ОНМК_Всего_Fitted Пролечены_с_ОНМК_Всего_Prediction Пролечены_с_ОНМК_Всего_Prediction_CI_low Пролечены_с_ОНМК_Всего_Prediction_CI_upp
0 2017-01-01 2017-01 1446.0 0.000000 None None None
1 2017-02-01 2017-02 1859.0 624.025294 None None None
2 2017-03-01 2017-03 2285.0 1093.769856 None None None
3 2017-04-01 2017-04 1678.0 1397.157963 None None None
4 2017-05-01 2017-05 1657.0 1921.589855 None None None
... ... ... ... ... ... ... ...
89 2024-06-01 2024-06 1446.0 NaN 1551.242293 1162.663339 1939.821247
90 2024-07-01 2024-07 1349.0 NaN 1459.59544 1073.438359 1845.752521
91 2024-08-01 2024-08 1802.0 NaN 1830.500051 1443.776983 2217.22312
92 2024-09-01 2024-09 1322.0 NaN 1458.503434 1094.459409 1822.54746
93 2024-10-01 2024-10 1729.0 NaN 1457.02495 1094.054345 1819.995554

94 rows × 7 columns

In [35]:
# Plot the rolling ARIMA(2,1,3) forecast with confidence intervals
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         cutoff_date="12-30-2023",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

SARIMA(2, 1, 3)(1, 0, 1, 12)¶

In [36]:
# NOTE(review): same positional split as in the previous sections, repeated
# so the SARIMA section can be run independently.
data_used_model = data_hosp_MR_used.copy()

train_month=84 #first 84 months = 2017-2023

data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]
In [37]:
# Fit SARIMA(2,1,3)(1,0,1,12): same non-seasonal order as before plus
# seasonal AR(1)/MA(1) terms with a 12-month period.
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), train_month=train_month, trend='n',
                         seasonal_order=(1, 0, 1, 12))
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column)
print(model.summary())

metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
# Last expression: rich display of the metrics table
metrics_df
                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:               Пролечены_с_ОНМК_Всего   No. Observations:                   84
Model:             SARIMAX(2, 1, 3)x(1, 0, [1], 12)   Log Likelihood                -445.405
Date:                              Thu, 30 Jan 2025   AIC                            906.810
Time:                                      19:51:18   BIC                            924.447
Sample:                                           0   HQIC                           913.789
                                               - 84                                         
Covariance Type:                                opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -1.1596      0.016    -73.907      0.000      -1.190      -1.129
ar.L2         -1.0245      0.024    -43.393      0.000      -1.071      -0.978
ma.L1          0.4167      0.366      1.138      0.255      -0.301       1.134
ma.L2          0.0988      0.403      0.246      0.806      -0.690       0.888
ma.L3         -0.7514      0.481     -1.563      0.118      -1.694       0.191
ar.S.L12       0.6709      0.126      5.344      0.000       0.425       0.917
ma.S.L12      -0.7560      0.428     -1.766      0.077      -1.595       0.083
sigma2      3.129e+04   1.86e+04      1.679      0.093   -5239.739    6.78e+04
===================================================================================
Ljung-Box (L1) (Q):                   1.34   Jarque-Bera (JB):                 1.22
Prob(Q):                              0.25   Prob(JB):                         0.54
Heteroskedasticity (H):               0.77   Skew:                            -0.26
Prob(H) (two-sided):                  0.54   Kurtosis:                         2.59
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
*** TRAIN ***
MAE: 217.10
MSE: 110861.39
RMSE: 332.96
MAPE: 12.83%
AIC: 906.8095059859434

*** TEST ***
MAE: 112.86
MSE: 16697.32
RMSE: 129.22
MAPE: 7.36%
Out[37]:
Dataset TRAIN TEST
MAE 217.096661 112.856308
MSE 110861.391485 16697.321712
RMSE 332.958543 129.218117
MAPE (%) 12.832502 7.362880
AIC 906.809506 NaN
In [38]:
# Plot actuals vs. fitted values and the SARIMA forecast with CIs
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp", 
                         cutoff_date="12-30-2023", vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)
In [39]:
arima_prophet_functions.model_performance(data_check_metrics, main_column)

Ljung-Box Test
      lb_stat  lb_pvalue
10  18.975142    0.04058

Rolling Prediction¶

In [40]:
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3), seasonal_order=(1, 0, 1, 12), train_month_start=84, train_month_end=94)
In [41]:
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN ***
MAE: 217.10
MSE: 110861.39
RMSE: 332.96
MAPE: 12.83%
AIC: 906.8095059859434

*** TEST ***
MAE: 108.89
MSE: 17436.32
RMSE: 132.05
MAPE: 7.02%
Out[41]:
Dataset TRAIN TEST
MAE 217.096661 108.888046
MSE 110861.391485 17436.315568
RMSE 332.958543 132.046642
MAPE (%) 12.832502 7.019480
AIC 906.809506 NaN
In [42]:
# Plot the rolling SARIMA forecast with confidence intervals
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         cutoff_date="12-30-2023",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

SARIMAX (with exogeneous params)(2, 1, 3)(1, 0, 1, 12)¶

In [43]:
# Binary exogenous regressor: 1 for rows in the COVID year (2020), else 0.
data_used_model['COVID'] = (data_used_model['year'] == 2020).astype(int)
In [44]:
# Re-split after adding the COVID column so train/test frames include it
train_month=84 #first 84 months = 2017-2023

data_used_train = data_used_model.iloc[:train_month]
data_used_test = data_used_model.iloc[train_month:]

Train Best Model¶

In [45]:
# SARIMAX: same SARIMA(2,1,3)(1,0,1,12) spec plus the COVID dummy as an
# exogenous regressor; the test-period exog values are passed separately
# for out-of-sample prediction.
model = arima_prophet_functions.train_arima_model(data_used_model[main_column], order=(2, 1, 3), trend="n", seasonal_order=(1, 0, 1, 12),
                                     exog=data_used_model['COVID'], train_month=84)
data_check_metrics = arima_prophet_functions.make_metrics_dataframe(model, data_used_train, data_used_test, main_column, exog_test=np.array(data_used_test['COVID']))
print(model.summary())

metrics_df = arima_prophet_functions.calculate_model_metrics(model, data_check_metrics, main_column)
# Last expression: rich display of the metrics table
metrics_df
                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:               Пролечены_с_ОНМК_Всего   No. Observations:                   84
Model:             SARIMAX(2, 1, 3)x(1, 0, [1], 12)   Log Likelihood                -445.533
Date:                              Thu, 30 Jan 2025   AIC                            909.066
Time:                                      19:51:22   BIC                            928.908
Sample:                                           0   HQIC                           916.918
                                               - 84                                         
Covariance Type:                                opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            55.4004    110.798      0.500      0.617    -161.759     272.560
ar.L1         -1.1615      0.014    -83.646      0.000      -1.189      -1.134
ar.L2         -1.0268      0.020    -50.661      0.000      -1.067      -0.987
ma.L1          0.4316      6.096      0.071      0.944     -11.516      12.379
ma.L2          0.0841      5.512      0.015      0.988     -10.720      10.888
ma.L3         -0.7658      7.783     -0.098      0.922     -16.021      14.489
ar.S.L12       0.6622      0.101      6.567      0.000       0.465       0.860
ma.S.L12      -1.0187      5.655     -0.180      0.857     -12.103      10.065
sigma2      2.264e+04   2.15e+05      0.105      0.916   -3.99e+05    4.45e+05
===================================================================================
Ljung-Box (L1) (Q):                   0.81   Jarque-Bera (JB):                 1.23
Prob(Q):                              0.37   Prob(JB):                         0.54
Heteroskedasticity (H):               0.77   Skew:                            -0.22
Prob(H) (two-sided):                  0.55   Kurtosis:                         2.50
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 7.12e+14. Standard errors may be unstable.
*** TRAIN ***
MAE: 218.59
MSE: 111355.65
RMSE: 333.70
MAPE: 12.93%
AIC: 909.0662050492497

*** TEST ***
MAE: 118.64
MSE: 17788.33
RMSE: 133.37
MAPE: 7.63%
Out[45]:
Dataset TRAIN TEST
MAE 218.589756 118.636750
MSE 111355.647097 17788.328890
RMSE 333.699936 133.372894
MAPE (%) 12.925759 7.627024
AIC 909.066205 NaN
In [46]:
# Plot actuals vs. fitted values and the SARIMAX forecast with CIs
arima_prophet_functions.plot_model_results(data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp", 
                         cutoff_date="12-30-2023", vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)
In [47]:
arima_prophet_functions.model_performance(data_check_metrics, main_column)

Ljung-Box Test
      lb_stat  lb_pvalue
10  19.137908   0.038546

Rolling Prediction¶

In [48]:
# Walk-forward prediction for SARIMAX, passing the COVID dummy as exog
final_data_check_metrics, model_first = arima_prophet_functions.rolling_prediction_function(data_used_model, main_column, (2,1,3), 
                                                                                            seasonal_order=(1,0,1,12), train_month_start=84, 
                                                                                            train_month_end=94, exog=data_used_model['COVID'])
In [49]:
arima_prophet_functions.calculate_model_metrics(model_first, final_data_check_metrics, main_column)
*** TRAIN ***
MAE: 218.59
MSE: 111355.65
RMSE: 333.70
MAPE: 12.93%
AIC: 909.0662050492497

*** TEST ***
MAE: 102.91
MSE: 15350.82
RMSE: 123.90
MAPE: 6.66%
Out[49]:
Dataset TRAIN TEST
MAE 218.589756 102.912664
MSE 111355.647097 15350.818098
RMSE 333.699936 123.898418
MAPE (%) 12.925759 6.657733
AIC 909.066205 NaN
In [50]:
# Plot the rolling SARIMAX forecast with confidence intervals
arima_prophet_functions.plot_model_results(final_data_check_metrics, main_column,
                   date_column='Date_Graph', 
                         prediction_column=f"{main_column}_Prediction", fitted_column=f"{main_column}_Fitted",
                         cutoff_date="12-30-2023",
                         ci_low_column=f"{main_column}_Prediction_CI_low", ci_upp_column=f"{main_column}_Prediction_CI_upp",
                                           vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None)

Additional Things¶

GARCH (1, 1)¶

In [51]:
from arch import arch_model
In [52]:
# Residuals from the SARIMAX fit; drop NaNs so the GARCH estimator
# receives a gap-free series.
residuals = data_check_metrics['residuals'].dropna()

# Fit a GARCH(1, 1) on the residuals to model conditional
# heteroskedasticity (volatility clustering) left over by the mean model.
garch_model = arch_model(residuals, vol='Garch', p=1, q=1)

# Fitting (disp='off' silences the optimizer's iteration log)
garch_results = garch_model.fit(disp='off')

# Print model summary
print(garch_results.summary())
                     Constant Mean - GARCH Model Results                      
==============================================================================
Dep. Variable:              residuals   R-squared:                       0.000
Mean Model:             Constant Mean   Adj. R-squared:                  0.000
Vol Model:                      GARCH   Log-Likelihood:               -578.736
Distribution:                  Normal   AIC:                           1165.47
Method:            Maximum Likelihood   BIC:                           1175.20
                                        No. Observations:                   84
Date:                Thu, Jan 30 2025   Df Residuals:                       83
Time:                        19:51:25   Df Model:                            1
                               Mean Model                               
========================================================================
                 coef    std err          t      P>|t|  95.0% Conf. Int.
------------------------------------------------------------------------
mu            10.0397     25.757      0.390      0.697 [-40.444, 60.523]
                               Volatility Model                              
=============================================================================
                 coef    std err          t      P>|t|       95.0% Conf. Int.
-----------------------------------------------------------------------------
omega       2193.5119   8057.773      0.272      0.785 [-1.360e+04,1.799e+04]
alpha[1]       0.0000      0.265      0.000      1.000      [ -0.519,  0.519]
beta[1]        0.9180      0.439      2.089  3.667e-02    [5.686e-02,  1.779]
=============================================================================

Covariance estimator: robust
In [53]:
# Get the conditional volatility (sigma_t)
data_check_metrics['Volatility'] = garch_results.conditional_volatility

# Plot residuals together with the GARCH-estimated volatility.
# Fix: the residual series previously had no label, so it was silently
# missing from the legend; also use the explicit fig/ax interface and
# label the axes so the figure stands alone.
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(data_check_metrics['Date'], data_check_metrics['residuals'], label="Residuals")
ax.plot(data_check_metrics['Date'], data_check_metrics['Volatility'], label="Estimated Volatility", color='orange')
ax.legend()
ax.set_title("GARCH Estimated Volatility")
ax.set_xlabel("Date")
ax.set_ylabel("Value")
plt.show()
In [54]:
# Forecast conditional volatility for the next `forecast_horizon` periods
# (10 months here, since the data is monthly).  The original comment said
# "12 periods", which did not match the value below.
forecast_horizon = 10
garch_forecast = garch_results.forecast(horizon=forecast_horizon)

# Extract forecasted conditional variance (square root to get standard deviation)
forecast_volatility = np.sqrt(garch_forecast.variance.values[:, :])

print("Forecasted Volatility for next periods:", forecast_volatility)
Forecasted Volatility for next periods: [[164.31716041 164.25875607 164.20511977 164.15586361 164.11063106
  164.06909441 164.03095249 163.99592854 163.96376828 163.93423804]]
In [55]:
# Raw forecasted conditional variances (columns h.01..h.10 = 1..10 steps ahead)
garch_forecast.variance
Out[55]:
h.01 h.02 h.03 h.04 h.05 h.06 h.07 h.08 h.09 h.10
83 27000.129204 26980.938945 26963.321358 26947.147559 26932.299227 26918.66774 26906.153373 26894.664579 26884.117308 26874.434403

Prophet Model Choosing¶

In [56]:
from prophet import Prophet

Grid Search¶

In [57]:
# Data preparation
data_used_model = data_hosp_MR_used.copy()

# Train/Test split: pre-2024 rows are train, 2024 onwards is test.
# Prophet requires the date column to be named 'ds' and the target 'y'.
data_model_train = data_used_model.loc[data_used_model.Date < pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
data_model_test = data_used_model.loc[data_used_model.Date >= pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})

# Baseline Prophet configuration, fit before the grid search below.
model = Prophet(growth='linear',
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    interval_width=0.8,
    changepoint_prior_scale=0.05,
                n_changepoints=25,
               seasonality_mode='multiplicative')

model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(model, data_model_train, data_model_test, main_column)
metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)
19:51:25 - cmdstanpy - INFO - Chain [1] start processing
19:51:25 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.41
MSE: 18001.68
RMSE: 134.17
MAPE: 6.42%
AIC: 1161.4321715226604

*** TEST ***
MAE: 138.00
MSE: 46513.27
RMSE: 215.67
MAPE: 9.71%
In [58]:
# Hyperparameter search space for Prophet; every combination is tried.
param_grid = dict(
    weekly_seasonality=[False, True],
    daily_seasonality=[False, True],
    changepoint_prior_scale=[0.01, 0.05, 0.1],
    n_changepoints=[10, 25, 50],
    seasonality_mode=['additive', 'multiplicative'],
)

# Cartesian product of all option lists -> 2 * 2 * 3 * 3 * 2 = 72 combos.
# Tuple element order follows the insertion order of param_grid's keys.
grid = [combo for combo in itertools.product(*param_grid.values())]
In [59]:
# Exhaustive grid search: fit one Prophet model per parameter combination
# and record its train/test MAPE for later comparison.
mape_lst = []

for param in grid:
    weekly, daily, cp_prior, n_cp, seas_mode = param

    model = Prophet(
        growth='linear',
        yearly_seasonality=True,
        weekly_seasonality=weekly,
        daily_seasonality=daily,
        interval_width=0.8,
        changepoint_prior_scale=cp_prior,
        n_changepoints=n_cp,
        seasonality_mode=seas_mode,
    )

    # plotting=False keeps loop output limited to the printed metrics
    model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(
        model, data_model_train, data_model_test, main_column, plotting=False
    )
    metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)

    mape_lst.append({
        "param": param,
        "mape_train": metrics_df['TRAIN'].loc['MAPE (%)'],
        "mape_test": metrics_df['TEST'].loc['MAPE (%)'],
    })
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
19:51:26 - cmdstanpy - INFO - Chain [1] done processing
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.40
MSE: 32801.00
RMSE: 181.11
MAPE: 8.36%
AIC: 1211.8316667542454

*** TEST ***
MAE: 201.15
MSE: 69661.03
RMSE: 263.93
MAPE: 13.66%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.59
MSE: 32767.79
RMSE: 181.02
MAPE: 8.38%
AIC: 1211.7465757465359

*** TEST ***
MAE: 196.31
MSE: 66701.20
RMSE: 258.27
MAPE: 13.34%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.40
MSE: 32800.44
RMSE: 181.11
MAPE: 8.36%
AIC: 1211.8302326700466

*** TEST ***
MAE: 201.45
MSE: 69802.37
RMSE: 264.20
MAPE: 13.68%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.59
MSE: 32767.90
RMSE: 181.02
MAPE: 8.37%
AIC: 1211.7468690093963

*** TEST ***
MAE: 196.00
MSE: 66580.07
RMSE: 258.03
MAPE: 13.32%
19:51:26 - cmdstanpy - INFO - Chain [1] done processing
19:51:26 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.39
MSE: 32800.62
RMSE: 181.11
MAPE: 8.36%
AIC: 1211.8307084322405

*** TEST ***
MAE: 202.21
MSE: 70169.69
RMSE: 264.90
MAPE: 13.74%
19:51:27 - cmdstanpy - INFO - Chain [1] done processing
19:51:27 - cmdstanpy - INFO - Chain [1] start processing
19:51:27 - cmdstanpy - INFO - Chain [1] done processing
19:51:27 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.56
MSE: 32770.58
RMSE: 181.03
MAPE: 8.38%
AIC: 1211.7537234821168

*** TEST ***
MAE: 197.35
MSE: 67083.07
RMSE: 259.00
MAPE: 13.42%
*** TRAIN ***
MAE: 142.39
MSE: 32794.91
RMSE: 181.09
MAPE: 8.36%
AIC: 1211.8160710657176

*** TEST ***
MAE: 201.75
MSE: 69940.45
RMSE: 264.46
MAPE: 13.71%
19:51:27 - cmdstanpy - INFO - Chain [1] done processing
19:51:27 - cmdstanpy - INFO - Chain [1] start processing
19:51:27 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 142.58
MSE: 32759.43
RMSE: 181.00
MAPE: 8.38%
AIC: 1211.7251484645333

*** TEST ***
MAE: 196.76
MSE: 66931.26
RMSE: 258.71
MAPE: 13.37%
19:51:27 - cmdstanpy - INFO - Chain [1] start processing
19:51:27 - cmdstanpy - INFO - Chain [1] done processing
19:51:27 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.38
MSE: 32796.04
RMSE: 181.10
MAPE: 8.36%
AIC: 1211.8189575866493

*** TEST ***
MAE: 201.65
MSE: 69888.98
RMSE: 264.37
MAPE: 13.70%
*** TRAIN ***
MAE: 142.57
MSE: 32757.38
RMSE: 180.99
MAPE: 8.37%
AIC: 1211.719903822935

*** TEST ***
MAE: 196.39
MSE: 66745.92
RMSE: 258.35
MAPE: 13.35%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing
19:51:28 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.36
MSE: 32790.47
RMSE: 181.08
MAPE: 8.36%
AIC: 1211.8046900707213

*** TEST ***
MAE: 201.25
MSE: 69700.30
RMSE: 264.01
MAPE: 13.67%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing
19:51:28 - cmdstanpy - INFO - Chain [1] start processing
19:51:28 - cmdstanpy - INFO - Chain [1] done processing
19:51:28 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.56
MSE: 32746.92
RMSE: 180.96
MAPE: 8.37%
AIC: 1211.693060898126

*** TEST ***
MAE: 196.58
MSE: 66814.37
RMSE: 258.48
MAPE: 13.36%
*** TRAIN ***
MAE: 141.31
MSE: 31956.17
RMSE: 178.76
MAPE: 8.30%
AIC: 1209.6397957372196

*** TEST ***
MAE: 211.42
MSE: 75436.71
RMSE: 274.66
MAPE: 14.43%
19:51:28 - cmdstanpy - INFO - Chain [1] done processing
19:51:28 - cmdstanpy - INFO - Chain [1] start processing
19:51:29 - cmdstanpy - INFO - Chain [1] done processing
19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 140.63
MSE: 31548.39
RMSE: 177.62
MAPE: 8.26%
AIC: 1208.561008857982

*** TEST ***
MAE: 212.72
MSE: 76316.66
RMSE: 276.25
MAPE: 14.53%
*** TRAIN ***
MAE: 141.30
MSE: 31963.70
RMSE: 178.78
MAPE: 8.30%
AIC: 1209.659601587275

*** TEST ***
MAE: 211.58
MSE: 75538.79
RMSE: 274.84
MAPE: 14.44%
19:51:29 - cmdstanpy - INFO - Chain [1] done processing
19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 140.64
MSE: 31550.61
RMSE: 177.62
MAPE: 8.26%
AIC: 1208.5669146851637

*** TEST ***
MAE: 212.54
MSE: 76193.88
RMSE: 276.03
MAPE: 14.51%
19:51:29 - cmdstanpy - INFO - Chain [1] done processing
19:51:29 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 141.30
MSE: 31954.82
RMSE: 178.76
MAPE: 8.30%
AIC: 1209.6362502297068

*** TEST ***
MAE: 211.74
MSE: 75631.62
RMSE: 275.01
MAPE: 14.45%
19:51:30 - cmdstanpy - INFO - Chain [1] done processing
19:51:30 - cmdstanpy - INFO - Chain [1] start processing
19:51:30 - cmdstanpy - INFO - Chain [1] done processing
19:51:30 - cmdstanpy - INFO - Chain [1] start processing
19:51:30 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 140.80
MSE: 31587.62
RMSE: 177.73
MAPE: 8.28%
AIC: 1208.6653961907946

*** TEST ***
MAE: 211.24
MSE: 75221.15
RMSE: 274.26
MAPE: 14.43%
*** TRAIN ***
MAE: 142.39
MSE: 32800.34
RMSE: 181.11
MAPE: 8.36%
AIC: 1211.829973143091

*** TEST ***
MAE: 201.71
MSE: 69919.14
RMSE: 264.42
MAPE: 13.70%
19:51:30 - cmdstanpy - INFO - Chain [1] start processing
19:51:30 - cmdstanpy - INFO - Chain [1] done processing
19:51:30 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.62
MSE: 32768.03
RMSE: 181.02
MAPE: 8.38%
AIC: 1211.7471897504593

*** TEST ***
MAE: 196.06
MSE: 66583.33
RMSE: 258.04
MAPE: 13.33%
*** TRAIN ***
MAE: 142.39
MSE: 32800.34
RMSE: 181.11
MAPE: 8.36%
AIC: 1211.8299803493007

*** TEST ***
MAE: 201.69
MSE: 69912.49
RMSE: 264.41
MAPE: 13.70%
19:51:30 - cmdstanpy - INFO - Chain [1] done processing
19:51:30 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.36
MSE: 32772.97
RMSE: 181.03
MAPE: 8.37%
AIC: 1211.7598504525342

*** TEST ***
MAE: 196.15
MSE: 66811.70
RMSE: 258.48
MAPE: 13.34%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing
19:51:31 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.45
MSE: 32801.02
RMSE: 181.11
MAPE: 8.37%
AIC: 1211.8317235026589

*** TEST ***
MAE: 202.53
MSE: 70263.13
RMSE: 265.07
MAPE: 13.75%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing
19:51:31 - cmdstanpy - INFO - Chain [1] start processing
19:51:31 - cmdstanpy - INFO - Chain [1] done processing
19:51:31 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.34
MSE: 32773.47
RMSE: 181.03
MAPE: 8.37%
AIC: 1211.7611503883836

*** TEST ***
MAE: 196.03
MSE: 66771.15
RMSE: 258.40
MAPE: 13.33%
*** TRAIN ***
MAE: 142.36
MSE: 32795.33
RMSE: 181.09
MAPE: 8.36%
AIC: 1211.817139775301

*** TEST ***
MAE: 201.45
MSE: 69818.73
RMSE: 264.23
MAPE: 13.69%
19:51:31 - cmdstanpy - INFO - Chain [1] done processing
19:51:31 - cmdstanpy - INFO - Chain [1] start processing
19:51:31 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 142.58
MSE: 32752.27
RMSE: 180.98
MAPE: 8.37%
AIC: 1211.7067887604167

*** TEST ***
MAE: 196.34
MSE: 66711.19
RMSE: 258.29
MAPE: 13.35%
19:51:31 - cmdstanpy - INFO - Chain [1] start processing
19:51:32 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 142.39
MSE: 32795.69
RMSE: 181.10
MAPE: 8.36%
AIC: 1211.818072129617

*** TEST ***
MAE: 201.74
MSE: 69934.01
RMSE: 264.45
MAPE: 13.71%
19:51:32 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.57
MSE: 32751.16
RMSE: 180.97
MAPE: 8.37%
AIC: 1211.7039440549875

*** TEST ***
MAE: 196.22
MSE: 66653.47
RMSE: 258.17
MAPE: 13.34%
19:51:32 - cmdstanpy - INFO - Chain [1] done processing
19:51:32 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.38
MSE: 32782.34
RMSE: 181.06
MAPE: 8.36%
AIC: 1211.7838819515441

*** TEST ***
MAE: 201.99
MSE: 70047.40
RMSE: 264.66
MAPE: 13.72%
19:51:32 - cmdstanpy - INFO - Chain [1] done processing
19:51:32 - cmdstanpy - INFO - Chain [1] start processing
19:51:32 - cmdstanpy - INFO - Chain [1] done processing
19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 142.47
MSE: 32718.27
RMSE: 180.88
MAPE: 8.37%
AIC: 1211.619539455928

*** TEST ***
MAE: 196.37
MSE: 66767.81
RMSE: 258.39
MAPE: 13.35%
*** TRAIN ***
MAE: 141.30
MSE: 31951.45
RMSE: 178.75
MAPE: 8.30%
AIC: 1209.6274014812896

*** TEST ***
MAE: 211.76
MSE: 75640.61
RMSE: 275.03
MAPE: 14.45%
19:51:33 - cmdstanpy - INFO - Chain [1] done processing
19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 126.78
MSE: 25484.13
RMSE: 159.64
MAPE: 7.38%
AIC: 1190.6298165258954

*** TEST ***
MAE: 166.48
MSE: 43661.03
RMSE: 208.95
MAPE: 11.22%
19:51:33 - cmdstanpy - INFO - Chain [1] done processing
19:51:33 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 141.61
MSE: 32091.08
RMSE: 179.14
MAPE: 8.32%
AIC: 1209.9936684566535

*** TEST ***
MAE: 209.54
MSE: 74114.53
RMSE: 272.24
MAPE: 14.29%
19:51:34 - cmdstanpy - INFO - Chain [1] done processing
19:51:34 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 127.78
MSE: 25610.41
RMSE: 160.03
MAPE: 7.45%
AIC: 1191.0450200695864

*** TEST ***
MAE: 167.00
MSE: 43445.94
RMSE: 208.44
MAPE: 11.23%
19:51:34 - cmdstanpy - INFO - Chain [1] done processing
19:51:34 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 141.30
MSE: 31953.40
RMSE: 178.76
MAPE: 8.30%
AIC: 1209.6325266563463

*** TEST ***
MAE: 211.82
MSE: 75684.75
RMSE: 275.11
MAPE: 14.46%
19:51:35 - cmdstanpy - INFO - Chain [1] done processing
19:51:35 - cmdstanpy - INFO - Chain [1] start processing
19:51:35 - cmdstanpy - INFO - Chain [1] done processing
19:51:35 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 126.84
MSE: 25123.84
RMSE: 158.51
MAPE: 7.39%
AIC: 1189.4337481491486

*** TEST ***
MAE: 166.13
MSE: 42857.94
RMSE: 207.02
MAPE: 11.16%
*** TRAIN ***
MAE: 106.04
MSE: 17983.82
RMSE: 134.10
MAPE: 6.40%
AIC: 1161.3487833789147

*** TEST ***
MAE: 137.95
MSE: 47899.78
RMSE: 218.86
MAPE: 9.68%
19:51:35 - cmdstanpy - INFO - Chain [1] done processing
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
19:51:36 - cmdstanpy - INFO - Chain [1] done processing
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.49
MSE: 18027.10
RMSE: 134.27
MAPE: 6.42%
AIC: 1161.5507381736484

*** TEST ***
MAE: 137.80
MSE: 46284.04
RMSE: 215.14
MAPE: 9.69%
*** TRAIN ***
MAE: 105.97
MSE: 17984.66
RMSE: 134.11
MAPE: 6.40%
AIC: 1161.3527042991773

*** TEST ***
MAE: 137.92
MSE: 47963.03
RMSE: 219.00
MAPE: 9.67%
19:51:36 - cmdstanpy - INFO - Chain [1] done processing
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
19:51:36 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.50
MSE: 18027.48
RMSE: 134.27
MAPE: 6.42%
AIC: 1161.5524758182455

*** TEST ***
MAE: 138.20
MSE: 46500.07
RMSE: 215.64
MAPE: 9.71%
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.25
MSE: 17982.79
RMSE: 134.10
MAPE: 6.41%
AIC: 1161.3440065418965

*** TEST ***
MAE: 137.44
MSE: 47331.86
RMSE: 217.56
MAPE: 9.63%
19:51:36 - cmdstanpy - INFO - Chain [1] done processing
19:51:36 - cmdstanpy - INFO - Chain [1] start processing
19:51:37 - cmdstanpy - INFO - Chain [1] done processing
19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.43
MSE: 18030.12
RMSE: 134.28
MAPE: 6.42%
AIC: 1161.5647924541381

*** TEST ***
MAE: 138.86
MSE: 47111.08
RMSE: 217.05
MAPE: 9.78%
*** TRAIN ***
MAE: 106.17
MSE: 17977.40
RMSE: 134.08
MAPE: 6.41%
AIC: 1161.3188211637055

*** TEST ***
MAE: 137.41
MSE: 47410.59
RMSE: 217.74
MAPE: 9.63%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing
19:51:37 - cmdstanpy - INFO - Chain [1] start processing
19:51:37 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.05
MSE: 17881.67
RMSE: 133.72
MAPE: 6.39%
AIC: 1160.8703092476076

*** TEST ***
MAE: 137.73
MSE: 46769.56
RMSE: 216.26
MAPE: 9.71%
19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.13
MSE: 17961.46
RMSE: 134.02
MAPE: 6.40%
AIC: 1161.2442876564796

*** TEST ***
MAE: 137.44
MSE: 47474.69
RMSE: 217.89
MAPE: 9.64%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing
19:51:37 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.41
MSE: 18001.68
RMSE: 134.17
MAPE: 6.42%
AIC: 1161.4321715226604

*** TEST ***
MAE: 138.00
MSE: 46513.27
RMSE: 215.67
MAPE: 9.71%
19:51:37 - cmdstanpy - INFO - Chain [1] done processing
19:51:38 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.15
MSE: 17968.68
RMSE: 134.05
MAPE: 6.41%
AIC: 1161.2780455124705

*** TEST ***
MAE: 137.50
MSE: 47481.62
RMSE: 217.90
MAPE: 9.64%
19:51:38 - cmdstanpy - INFO - Chain [1] done processing
19:51:38 - cmdstanpy - INFO - Chain [1] start processing
19:51:38 - cmdstanpy - INFO - Chain [1] done processing
19:51:38 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.00
MSE: 17867.13
RMSE: 133.67
MAPE: 6.39%
AIC: 1160.8019883422219

*** TEST ***
MAE: 137.69
MSE: 46800.76
RMSE: 216.33
MAPE: 9.71%
*** TRAIN ***
MAE: 104.22
MSE: 17409.11
RMSE: 131.94
MAPE: 6.29%
AIC: 1158.6205991769664

*** TEST ***
MAE: 137.42
MSE: 50423.11
RMSE: 224.55
MAPE: 9.77%
19:51:38 - cmdstanpy - INFO - Chain [1] done processing
19:51:38 - cmdstanpy - INFO - Chain [1] start processing
19:51:38 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 103.95
MSE: 17323.51
RMSE: 131.62
MAPE: 6.27%
AIC: 1158.2065347614514

*** TEST ***
MAE: 141.38
MSE: 50892.06
RMSE: 225.59
MAPE: 10.04%
19:51:38 - cmdstanpy - INFO - Chain [1] start processing
19:51:39 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 104.21
MSE: 17408.77
RMSE: 131.94
MAPE: 6.29%
AIC: 1158.6189284450898

*** TEST ***
MAE: 137.12
MSE: 50268.29
RMSE: 224.21
MAPE: 9.75%
19:51:39 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 103.95
MSE: 17324.57
RMSE: 131.62
MAPE: 6.27%
AIC: 1158.2116976583309

*** TEST ***
MAE: 141.19
MSE: 50797.02
RMSE: 225.38
MAPE: 10.03%
19:51:39 - cmdstanpy - INFO - Chain [1] done processing
19:51:39 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 104.18
MSE: 17399.29
RMSE: 131.91
MAPE: 6.29%
AIC: 1158.5731800224462

*** TEST ***
MAE: 137.20
MSE: 50316.80
RMSE: 224.31
MAPE: 9.76%
19:51:39 - cmdstanpy - INFO - Chain [1] done processing
19:51:39 - cmdstanpy - INFO - Chain [1] start processing
19:51:40 - cmdstanpy - INFO - Chain [1] done processing
19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 103.96
MSE: 17326.88
RMSE: 131.63
MAPE: 6.27%
AIC: 1158.2228920325133

*** TEST ***
MAE: 141.17
MSE: 50776.34
RMSE: 225.34
MAPE: 10.03%
*** TRAIN ***
MAE: 106.18
MSE: 17985.53
RMSE: 134.11
MAPE: 6.41%
AIC: 1161.3567690185905

*** TEST ***
MAE: 137.24
MSE: 47156.26
RMSE: 217.15
MAPE: 9.61%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing
19:51:40 - cmdstanpy - INFO - Chain [1] start processing
19:51:40 - cmdstanpy - INFO - Chain [1] done processing
19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.52
MSE: 18027.88
RMSE: 134.27
MAPE: 6.42%
AIC: 1161.554372704546

*** TEST ***
MAE: 138.52
MSE: 46710.96
RMSE: 216.13
MAPE: 9.74%
*** TRAIN ***
MAE: 106.18
MSE: 17982.64
RMSE: 134.10
MAPE: 6.41%
AIC: 1161.3433143211632

*** TEST ***
MAE: 137.55
MSE: 47477.13
RMSE: 217.89
MAPE: 9.64%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing
19:51:40 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.50
MSE: 18027.44
RMSE: 134.27
MAPE: 6.42%
AIC: 1161.552308945461

*** TEST ***
MAE: 138.03
MSE: 46439.50
RMSE: 215.50
MAPE: 9.70%
19:51:40 - cmdstanpy - INFO - Chain [1] done processing
19:51:41 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.28
MSE: 17982.96
RMSE: 134.10
MAPE: 6.41%
AIC: 1161.3447709548673

*** TEST ***
MAE: 137.40
MSE: 47291.94
RMSE: 217.47
MAPE: 9.63%
19:51:41 - cmdstanpy - INFO - Chain [1] done processing
19:51:41 - cmdstanpy - INFO - Chain [1] start processing
19:51:41 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.54
MSE: 18027.82
RMSE: 134.27
MAPE: 6.42%
AIC: 1161.55407501765

*** TEST ***
MAE: 138.34
MSE: 46666.54
RMSE: 216.02
MAPE: 9.73%
*** TRAIN ***
MAE: 106.16
MSE: 17973.81
RMSE: 134.07
MAPE: 6.41%
AIC: 1161.302058713311

*** TEST ***
MAE: 137.43
MSE: 47430.47
RMSE: 217.79
MAPE: 9.63%
19:51:41 - cmdstanpy - INFO - Chain [1] start processing
19:51:41 - cmdstanpy - INFO - Chain [1] done processing
19:51:41 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 78.39
MSE: 10347.81
RMSE: 101.72
MAPE: 4.66%
AIC: 1114.9222427207478

*** TEST ***
MAE: 137.22
MSE: 40949.61
RMSE: 202.36
MAPE: 9.34%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing
19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.16
MSE: 17973.86
RMSE: 134.07
MAPE: 6.41%
AIC: 1161.3022664464231

*** TEST ***
MAE: 137.48
MSE: 47460.77
RMSE: 217.85
MAPE: 9.64%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing
19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.44
MSE: 17999.91
RMSE: 134.16
MAPE: 6.42%
AIC: 1161.423904386426

*** TEST ***
MAE: 138.07
MSE: 46524.80
RMSE: 215.70
MAPE: 9.71%
19:51:42 - cmdstanpy - INFO - Chain [1] done processing
19:51:42 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 106.12
MSE: 17960.55
RMSE: 134.02
MAPE: 6.40%
AIC: 1161.2400553588784

*** TEST ***
MAE: 137.26
MSE: 47378.64
RMSE: 217.67
MAPE: 9.62%
19:51:44 - cmdstanpy - INFO - Chain [1] done processing
19:51:44 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 73.85
MSE: 8935.99
RMSE: 94.53
MAPE: 4.38%
AIC: 1102.6004215176854

*** TEST ***
MAE: 138.75
MSE: 40876.14
RMSE: 202.18
MAPE: 9.58%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing
19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 104.22
MSE: 17407.91
RMSE: 131.94
MAPE: 6.29%
AIC: 1158.6147988247596

*** TEST ***
MAE: 137.41
MSE: 50412.70
RMSE: 224.53
MAPE: 9.77%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing
19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 76.03
MSE: 10065.00
RMSE: 100.32
MAPE: 4.52%
AIC: 1112.594468393507

*** TEST ***
MAE: 140.32
MSE: 40447.70
RMSE: 201.12
MAPE: 9.52%
19:51:45 - cmdstanpy - INFO - Chain [1] done processing
19:51:45 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 104.21
MSE: 17406.04
RMSE: 131.93
MAPE: 6.29%
AIC: 1158.605747442009

*** TEST ***
MAE: 137.34
MSE: 50371.84
RMSE: 224.44
MAPE: 9.77%
19:51:47 - cmdstanpy - INFO - Chain [1] done processing
19:51:47 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 67.21
MSE: 7282.93
RMSE: 85.34
MAPE: 4.00%
AIC: 1085.417928875907

*** TEST ***
MAE: 168.19
MSE: 56249.75
RMSE: 237.17
MAPE: 11.77%
19:51:47 - cmdstanpy - INFO - Chain [1] done processing
19:51:47 - cmdstanpy - INFO - Chain [1] start processing
*** TRAIN ***
MAE: 104.20
MSE: 17404.68
RMSE: 131.93
MAPE: 6.29%
AIC: 1158.5991978988607

*** TEST ***
MAE: 137.32
MSE: 50369.69
RMSE: 224.43
MAPE: 9.77%
19:51:48 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 38.17
MSE: 2551.88
RMSE: 50.52
MAPE: 2.28%
AIC: 997.326940622734

*** TEST ***
MAE: 228.10
MSE: 91715.28
RMSE: 302.85
MAPE: 15.66%
In [60]:
# Collect grid-search results into a DataFrame (one row per combination)
results_grid = pd.DataFrame(mape_lst)
results_grid
Out[60]:
param mape_train mape_test
0 (False, False, 0.01, 10, additive) 8.361144 13.662470
1 (False, False, 0.01, 10, multiplicative) 8.375450 13.342177
2 (False, False, 0.01, 25, additive) 8.362008 13.684633
3 (False, False, 0.01, 25, multiplicative) 8.374638 13.319802
4 (False, False, 0.01, 50, additive) 8.364424 13.737819
... ... ... ...
67 (True, True, 0.1, 10, multiplicative) 4.517779 9.520318
68 (True, True, 0.1, 25, additive) 6.290986 9.766588
69 (True, True, 0.1, 25, multiplicative) 4.002654 11.770559
70 (True, True, 0.1, 50, additive) 6.290383 9.765437
71 (True, True, 0.1, 50, multiplicative) 2.282970 15.656989

72 rows × 3 columns

In [61]:
# Combined score: penalizes configurations that do poorly on either split
results_grid['MAPE_Sum'] = results_grid['mape_train'] + results_grid['mape_test']
In [62]:
# Rank configurations by out-of-sample (test) MAPE, best first
results_grid.sort_values('mape_test')
Out[62]:
param mape_train mape_test MAPE_Sum
61 (True, True, 0.05, 10, multiplicative) 4.659438 9.341163 14.000601
67 (True, True, 0.1, 10, multiplicative) 4.517779 9.520318 14.038097
65 (True, True, 0.05, 50, multiplicative) 4.379537 9.578923 13.958460
54 (True, True, 0.01, 10, additive) 6.409693 9.608355 16.018049
64 (True, True, 0.05, 50, additive) 6.403911 9.623915 16.027826
... ... ... ... ...
30 (False, True, 0.1, 10, additive) 8.301276 14.452606 22.753882
34 (False, True, 0.1, 50, additive) 8.301606 14.457283 22.758889
15 (False, False, 0.1, 25, multiplicative) 8.264668 14.514892 22.779560
13 (False, False, 0.1, 10, multiplicative) 8.264064 14.527584 22.791647
71 (True, True, 0.1, 50, multiplicative) 2.282970 15.656989 17.939958

72 rows × 4 columns

Best Model¶

In [63]:
# Chosen configuration: (weekly_seasonality, daily_seasonality,
# changepoint_prior_scale, n_changepoints, seasonality_mode).
# NOTE(review): this is not the top row of the grid-search ranking
# (which was (True, True, 0.05, 10, 'multiplicative')) — presumably a
# deliberate trade-off for a simpler additive model; confirm.
param = (True, False, 0.01, 50, "additive")

weekly, daily, cp_prior, n_cp, seas_mode = param
model = Prophet(
    growth='linear',
    yearly_seasonality=True,
    weekly_seasonality=weekly,
    daily_seasonality=daily,
    interval_width=0.8,
    changepoint_prior_scale=cp_prior,
    n_changepoints=n_cp,
    seasonality_mode=seas_mode,
)
In [64]:
# Data preparation
data_used_model = data_hosp_MR_used.copy()

# Train/Test split: pre-2024 rows are train, 2024 onwards is test.
# Prophet requires the date column to be named 'ds' and the target 'y'.
data_model_train = data_used_model.loc[data_used_model.Date < pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
data_model_test = data_used_model.loc[data_used_model.Date >= pd.to_datetime("2024-01-01")].rename(columns={'Date': 'ds', main_column: 'y'})
In [65]:
# Train the chosen Prophet model, compute train/test metrics and run
# residual diagnostics (Ljung-Box).  Note: `calcualate_...` (sic) is the
# helper's actual, misspelled name in arima_prophet_functions.
model, data_check_metrics, forecast = arima_prophet_functions.prophet_model_train(model, data_model_train, data_model_test, main_column=main_column)
metrics_df = arima_prophet_functions.calculate_model_metrics_prophet(data_check_metrics, main_column)
arima_prophet_functions.calcualate_model_performance_prophet(model, data_check_metrics, main_column)
19:51:48 - cmdstanpy - INFO - Chain [1] start processing
19:51:49 - cmdstanpy - INFO - Chain [1] done processing
*** TRAIN ***
MAE: 106.25
MSE: 17982.79
RMSE: 134.10
MAPE: 6.41%
AIC: 1161.3440065418965

*** TEST ***
MAE: 137.44
MSE: 47331.86
RMSE: 217.56
MAPE: 9.63%


Ljung-Box Test
      lb_stat  lb_pvalue
10  36.978557   0.000057
In [66]:
# Final train/test metric table for the chosen Prophet model
metrics_df
Out[66]:
Dataset TRAIN TEST
MAE 106.247953 137.438717
MSE 17982.792915 47331.864539
RMSE 134.099936 217.558876
MAPE (%) 6.409770 9.631075
AIC 1161.344007 NaN
In [67]:
# Plot the best Prophet model: actuals, fitted values, forecast and CI
# bounds; rows after cutoff_date are the test period.
arima_prophet_functions.plot_model_results_prophet(
    data_check_metrics,
    main_column,
    date_column='Date_Graph',
    prediction_column=f"{main_column}_Prediction",
    fitted_column=f"{main_column}_Fitted",
    ci_low_column=f"{main_column}_Prediction_CI_low",
    ci_upp_column=f"{main_column}_Prediction_CI_upp",
    cutoff_date="12-30-2023",
    vertical_line_date=None,
    vertical_line_ymin=None,
    vertical_line_ymax=None,
)
In [68]:
import pandas as pd
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
import warnings
import itertools

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import het_breuschpagan

### ARIMA Functions
def train_arima_model(data, order, train_month=84, seasonal_order=None, exog=None, trend=None):
    """
    Function to train ARIMA/ARIMAX/SARIMAX models.
    
    Parameters:
        data (pd.Series): Time series data (index should be datetime-like).
        order (tuple): ARIMA order (p, d, q).
        seasonal_order (tuple, optional): Seasonal order (P, D, Q, s).
        exog (pd.DataFrame, optional): External regressors for ARIMAX.
        train_split (float or int): Fraction or number of data points for training.
        trend (str, optional): Trend component ('n', 'c', 't', 'ct').
    
    Returns:
        dict: A dictionary containing the model, predictions, and performance metrics.
    """
    # Split the data into training and testing sets 
    train_data = data.iloc[:train_month]
    test_data = data.iloc[train_month:]

    train_size = train_data.shape[0]
    
    if exog is not None:
        exog_train = np.array(exog.iloc[:train_size])
        exog_test = np.array(exog.iloc[train_size:])
    else:
        exog_train, exog_test = None, None
    
    # Define the model
    model = SARIMAX(train_data,
                    order=order,
                    seasonal_order=seasonal_order if seasonal_order else (0, 0, 0, 0),
                    exog=exog_train,
                    trend=trend,
                    enforce_stationarity=False,
                    enforce_invertibility=False)
    
    # Fit the model
    model = model.fit()
    
    # In-sample predictions
    train_predictions = model.fittedvalues
    
    return model

def make_metrics_dataframe(model, data_model_train, data_model_test, main_column, exog_test=None, plotting=True, *args, **kwargs):
    """
    Build one DataFrame holding actuals, in-sample fitted values and
    out-of-sample forecasts (with confidence intervals) for metric
    calculation and plotting.

    Parameters:
        model: Fitted statsmodels SARIMAX results object.
        data_model_train (pd.DataFrame): Training rows; must contain
            'Date', 'Date_Graph' and `main_column`.
        data_model_test (pd.DataFrame): Test rows, same columns.
        main_column (str): Name of the target column.
        exog_test (array-like, optional): Exogenous regressors for the
            forecast horizon (needed if the model was fit with exog).
        plotting (bool): Unused in this function; kept for signature
            compatibility with related helpers.  *args/**kwargs likewise.

    Returns:
        pd.DataFrame: Combined train+test frame with Fitted, Prediction
        and CI columns, truncated at 2024-10-10.
    """
    # Forecast one step per test row.
    forecast_steps = data_model_test.shape[0]  # Number of periods to forecast
    forecast = model.get_forecast(steps=forecast_steps, exog=exog_test)
    forecast_values = forecast.predicted_mean
    forecast_ci = forecast.conf_int()

    # Make final dataset
    train_size = data_model_train.shape[0]
    test_size = data_model_test.shape[0]  # NOTE: computed but not used below
    
    data_check_metrics = pd.concat([data_model_train, data_model_test])
    data_check_metrics = data_check_metrics[['Date', 'Date_Graph', main_column]]
    # Fitted values cover the training index only; test rows get NaN.
    data_check_metrics.loc[:, f"{main_column}_Fitted"] = model.fittedvalues
    # Pad prediction columns with None over the training rows so they span
    # the whole frame.  NOTE(review): this relies on the concatenated
    # Series' index (RangeIndex padding + forecast index) lining up with
    # data_check_metrics' index — verify if the inputs ever carry a
    # non-default index.
    data_check_metrics.loc[:, f"{main_column}_Prediction"] = pd.concat([pd.Series([None]*train_size), forecast_values])
    data_check_metrics.loc[:, f"{main_column}_Prediction_CI_low"] = pd.concat([pd.Series([None]*train_size), forecast_ci[f'lower {main_column}']])
    data_check_metrics.loc[:, f"{main_column}_Prediction_CI_upp"] = pd.concat([pd.Series([None]*train_size), forecast_ci[f'upper {main_column}']])
    
    # Hard cutoff: keep rows up to the last complete month of data.
    data_check_metrics = data_check_metrics.loc[data_check_metrics.Date <= pd.to_datetime("2024-10-10")]
    return data_check_metrics

def calculate_model_metrics(model, data_check_metrics, main_column):
    """Report MAE/MSE/RMSE/MAPE for the train (fitted) and test (forecast)
    portions of a metrics frame, plus the model's AIC on the train side.

    Parameters:
        model: fitted statsmodels results object exposing `.aic`.
        data_check_metrics (pd.DataFrame): frame with main_column plus the
            f"{main_column}_Fitted" and f"{main_column}_Prediction" columns.
        main_column (str): name of the observed-values column.

    Returns:
        pd.DataFrame: metric names as rows, 'TRAIN'/'TEST' as columns.
    """
    observed = data_check_metrics[main_column]

    def _scores(estimate):
        # NaN rows drop out of the means automatically (pandas skipna), so
        # each column implicitly selects its own train/test subset.
        err = observed - estimate
        mae = np.mean(np.abs(err))
        mse = np.mean(err ** 2)
        mape = np.mean(np.abs(err / observed)) * 100
        return mae, mse, np.sqrt(mse), mape

    # TRAIN metrics come from the in-sample fitted values
    mae_train, mse_train, rmse_train, mape_train = _scores(data_check_metrics[f"{main_column}_Fitted"])
    aic_train = model.aic

    # TEST metrics come from the out-of-sample predictions (no AIC there)
    mae_test, mse_test, rmse_test, mape_test = _scores(data_check_metrics[f"{main_column}_Prediction"])

    metrics_df = pd.DataFrame(
        {
            "TRAIN": [mae_train, mse_train, rmse_train, mape_train, aic_train],
            "TEST": [mae_test, mse_test, rmse_test, mape_test, None],
        },
        index=["MAE", "MSE", "RMSE", "MAPE (%)", "AIC"],
    )
    metrics_df.columns.name = "Dataset"

    # Print the metrics
    print("*** TRAIN ***")
    print(f"MAE: {mae_train:.2f}")
    print(f"MSE: {mse_train:.2f}")
    print(f"RMSE: {rmse_train:.2f}")
    print(f"MAPE: {mape_train:.2f}%")
    print(f"AIC: {aic_train}")

    print("\n*** TEST ***")
    print(f"MAE: {mae_test:.2f}")
    print(f"MSE: {mse_test:.2f}")
    print(f"RMSE: {rmse_test:.2f}")
    print(f"MAPE: {mape_test:.2f}%")

    return metrics_df

def model_performance(data_check_metrics, main_column):
    """Residual diagnostics for the in-sample (train) fit: Ljung-Box test,
    residuals-over-time plot, residual histogram, and ACF/PACF plots.

    Parameters:
        data_check_metrics (pd.DataFrame): frame holding main_column and
            f"{main_column}_Fitted". Mutated in place: a 'residuals' column
            is added.
        main_column (str): name of the observed-values column.

    Returns:
        None: prints the Ljung-Box table and shows the plots.
    """
    # Train size = number of rows that actually carry fitted values
    train_size = data_check_metrics[f"{main_column}_Fitted"].dropna().shape[0]
    # NOTE(review): assumes the fitted rows are the FIRST train_size rows of
    # the frame — holds for the train/test concat built upstream; verify for
    # other callers.
    real = data_check_metrics[main_column].iloc[0:train_size]
    fitted = data_check_metrics[f"{main_column}_Fitted"].iloc[0:train_size]
    
    # Calculate residuals (side effect: column added to the caller's frame)
    data_check_metrics['residuals'] = real - fitted
    residuals = data_check_metrics['residuals'].dropna()

    from statsmodels.stats.diagnostic import acorr_ljungbox

    # Ljung-Box test on residuals (H0: no autocorrelation up to lag 10)
    ljung_box_results = acorr_ljungbox(data_check_metrics['residuals'].dropna(), lags=[10], return_df=True)
    
    # Display results
    print('\n')
    print('Ljung-Box Test')
    print(ljung_box_results)

    # test_stat, p_value, _, _ = het_breuschpagan(residuals, exog_het=np.arange(len(residuals)))
    # # Display results
    # print('\n')
    # print('Breusch-Pagan Test')
    # print(test_stat, p_value)

    # Residuals plot
    plt.figure(figsize=(10, 6))
    plt.plot(residuals, label='Residuals', color='blue')
    plt.axhline(0, color='red', linestyle='--', linewidth=1)
    plt.title('Residuals Over Time')
    plt.xlabel('Time')
    plt.ylabel('Residuals')
    plt.legend()
    plt.show()
    
    # Histogram of residuals
    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=30, color='gray', edgecolor='black')
    plt.title('Histogram of Residuals')
    plt.xlabel('Residual Value')
    plt.ylabel('Frequency')
    plt.show()
    
    # ACF and PACF of residuals
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plot_acf(residuals, lags=10, ax=plt.gca(), title="ACF of Residuals")
    plt.subplot(1, 2, 2)
    plot_pacf(residuals, lags=10, ax=plt.gca(), title="PACF of Residuals")
    plt.tight_layout()
    plt.show()

def rolling_prediction_function(data_used_model, main_column, arima_order, seasonal_order=None, train_month_start=84, train_month_end=94, exog=None):
    """One-step-ahead rolling forecast: refit the (S)ARIMA model for each
    month in [train_month_start, train_month_end) and keep only the FIRST
    forecast step from each refit.

    Parameters:
        data_used_model (pd.DataFrame): full dataset (train + test rows).
        main_column (str): target column name.
        arima_order (tuple): (p, d, q) order of the model.
        seasonal_order (tuple, optional): (P, D, Q, s) seasonal order.
        train_month_start (int): row position where the test period begins.
        train_month_end (int): row position where the rolling window stops.
        exog (pd.DataFrame, optional): exogenous regressors.

    Returns:
        tuple: (metrics frame whose Prediction/CI columns hold the rolling
        one-step forecasts, model fitted on the initial training window).
    """
    data_used_train = data_used_model.iloc[:train_month_start]
    data_used_test = data_used_model.iloc[train_month_start:]
    
    rolling_prediction = []
    rolling_prediction_ci_low = []
    rolling_prediction_ci_upp = []
    
    # Baseline fit on the initial window; supplies the Fitted column of the
    # frame returned below.
    model_first = train_arima_model(data_used_model[main_column], arima_order, train_month=train_month_start, exog=exog, seasonal_order=seasonal_order)
    if exog is not None:
        first_data_check_metrics = make_metrics_dataframe(model_first, data_used_train, data_used_test, main_column, exog_test=exog.loc[data_used_test.index])
    else:
        first_data_check_metrics = make_metrics_dataframe(model_first, data_used_train, data_used_test, main_column)
    
    for train_month in range(train_month_start, train_month_end):
        #print(train_month)
        # Refit with one more month of history; only the first forecast step
        # (.dropna().iloc[0]) is retained below.
        model = train_arima_model(data_used_model[main_column], arima_order, train_month=train_month, seasonal_order=seasonal_order, exog=exog)
        if exog is not None:
            # NOTE(review): exog_test is always sliced from train_month_start,
            # not from the current train_month (and differs in form from the
            # exog.loc slice used above) — confirm this matches the horizon
            # make_metrics_dataframe expects.
            data_check_metrics = make_metrics_dataframe(model, data_used_train, data_used_test, main_column, exog_test=np.array(exog.iloc[train_month_start:]))
        else:
            data_check_metrics = make_metrics_dataframe(model, data_used_train, data_used_test, main_column, exog_test=None)
        rolling_prediction.append(data_check_metrics[f"{main_column}_Prediction"].dropna().iloc[0])
        rolling_prediction_ci_low.append(data_check_metrics[f"{main_column}_Prediction_CI_low"].dropna().iloc[0])
        rolling_prediction_ci_upp.append(data_check_metrics[f"{main_column}_Prediction_CI_upp"].dropna().iloc[0])
    
    # Overwrite the baseline forecasts with the rolling one-step forecasts
    final_data_check_metrics = first_data_check_metrics.copy()
    final_data_check_metrics[f"{main_column}_Prediction"] = pd.concat([pd.Series([None]*train_month_start), pd.Series(rolling_prediction)], ignore_index=True)
    final_data_check_metrics[f"{main_column}_Prediction_CI_low"] = pd.concat([pd.Series([None]*train_month_start), pd.Series(rolling_prediction_ci_low)], ignore_index=True)
    final_data_check_metrics[f"{main_column}_Prediction_CI_upp"] = pd.concat([pd.Series([None]*train_month_start), pd.Series(rolling_prediction_ci_upp)], ignore_index=True)

    return final_data_check_metrics, model_first

def grid_search_arima(data, min_p, max_p, min_q, max_q, d, 
                      min_P, max_P, min_Q, max_Q, 
                      min_D, max_D, seasonal_period, exog=None,
                     trends=None):
    """
    Perform a grid search to find the best ARIMA/SARIMA model.
    
    Parameters:
        data (pd.Series): Time series data (index should be datetime-like).
        min_p (int): Minimum value for p (AR order).
        max_p (int): Maximum value for p (AR order).
        min_q (int): Minimum value for q (MA order).
        max_q (int): Maximum value for q (MA order).
        d (int): Differencing order (fixed across the grid).
        min_P (int): Minimum value for seasonal P (seasonal AR order).
        max_P (int): Maximum value for seasonal P (seasonal AR order).
        min_Q (int): Minimum value for seasonal Q (seasonal MA order).
        max_Q (int): Maximum value for seasonal Q (seasonal MA order).
        min_D (int): Minimum value for seasonal D (seasonal differencing).
        max_D (int): Maximum value for seasonal D (seasonal differencing).
        seasonal_period (int): Seasonal period (s).
        exog (pd.DataFrame, optional): External regressors for ARIMAX.
        trends (list of str, optional): Trend terms to try; defaults to
            ['n', 'c', 't', 'ct'].
        
    Returns:
        pd.DataFrame: Grid search results with model orders, trend and AIC
            values (empty if every fit failed).
    """
    results = []
    if not trends:
        trends = ['n', 'c', 't', 'ct']  # Trend options
    
    # Exhaustively fit every (p, q, P, Q, D, trend) combination
    for p, q, P, Q, D, trend in itertools.product(
        range(min_p, max_p + 1),
        range(min_q, max_q + 1),
        range(min_P, max_P + 1),
        range(min_Q, max_Q + 1),
        range(min_D, max_D + 1),
        trends
    ):
        try:
            # Fit SARIMAX model
            model = train_arima_model(data, order=(p, d, q), seasonal_order=(P, D, Q, seasonal_period), trend=trend,
                                     exog=exog)
            
            # Store results
            results.append({
                'p': p,
                'q': q,
                'd': d,
                'P': P,
                'Q': Q,
                'D': D,
                'seasonal_period': seasonal_period,
                'trend': trend,
                'AIC': model.aic
            })
        except Exception as e:
            # Report the failing combination AND the reason — the previous
            # version swallowed the exception message with a bare 'error'.
            print(p, q, d, P, Q, D, trend)
            print(f'error: {e}')
            continue
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    return results_df

def plot_model_results(data, main_column, date_column='Date_Graph', 
                         prediction_column=None, fitted_column=None,
                         ci_low_column=None, ci_upp_column=None, 
                         cutoff_date=None, vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None,
                      figsize=None):
    """
    Plot model results: actual vs fitted/predicted values, optionally followed
    by a zoomed-in forecast plot with confidence intervals.

    Parameters:
        data (pd.DataFrame): DataFrame containing the time series data.
            NOTE: mutated in place — date_column is converted to datetime.
        main_column (str): Column name for the actual values.
        date_column (str): Column name for the date. Default is 'Date_Graph'.
        prediction_column (str): Column name for predictions.
        fitted_column (str): Column name for fitted values.
        ci_low_column (str): Column name for lower confidence interval.
        ci_upp_column (str): Column name for upper confidence interval.
        cutoff_date (str): Date to filter the data for predictions. Format: 'YYYY-MM-DD'.
        vertical_line_date (str): Date for a vertical line. Format: 'YYYY-MM-DD'.
        vertical_line_ymin (float): Y-axis minimum for the vertical line.
        vertical_line_ymax (float): Y-axis maximum for the vertical line.
        figsize (tuple, optional): Figure size; defaults to (10, 6).

    Returns:
        None: Displays the plot(s).
    """
    if figsize is None:
        figsize = (10, 6)
    # Ensure date_column is in datetime format
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

    # Plot the data
    plt.figure(figsize=figsize)
    plt.plot(data[date_column], data[main_column], label='Actual', color="orange", alpha=0.7)

    if fitted_column:
        plt.plot(data[date_column], data[fitted_column], label='Fitted', color='blue', linestyle="dashed", linewidth=3)

    if prediction_column:
        plt.plot(data[date_column], data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')

    if vertical_line_date:
        plt.vlines(x=pd.to_datetime(vertical_line_date), linestyles='--', color='orange', ymin=vertical_line_ymin, ymax=vertical_line_ymax)

    # Add titles and labels
    plt.title('Model - Actual vs Fitted and Forecast')
    plt.xlabel('Date')
    plt.ylabel('Load')
    plt.ylim(data[main_column].min() - 200, data[main_column].max() + 200)
    plt.legend()
    plt.grid()

    # Xticks
    xticks = data[date_column][::2]  # Select every second tick
    plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
    plt.show()

    if cutoff_date:
        # Filter the dataset for plotting predictions
        filtered_data = data.loc[data[date_column] > pd.to_datetime(cutoff_date)]
        
        if prediction_column:
            filtered_data[prediction_column] = filtered_data[prediction_column].fillna(filtered_data[fitted_column])

        # Plot the filtered data
        plt.figure(figsize=figsize)
        plt.plot(filtered_data[date_column], filtered_data[main_column], label='Actual', color="orange", alpha=0.7)

        if prediction_column:
            plt.plot(filtered_data[date_column], filtered_data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')

        if ci_low_column and ci_upp_column:
            # FIXME: hard-coded stitch — copies the 2023-12-01 fitted value
            # into the prediction column so the forecast line connects to the
            # actuals; should be derived from cutoff_date instead.
            filtered_data.loc[filtered_data["Date"] == pd.to_datetime("2023-12-01"), 
                f"{main_column}_Prediction"] = data.loc[data["Date"] == pd.to_datetime("2023-12-01"), f"{main_column}_Fitted"]
            # FIXME: magic fallback value 1500 fills remaining NaNs so
            # fill_between does not break — replace with a principled default.
            filtered_data = filtered_data.fillna(1500)
            #print(filtered_data.head())
            plt.fill_between(filtered_data[date_column], 
                             filtered_data[ci_low_column], 
                             filtered_data[ci_upp_column], 
                             color='pink', alpha=0.3, label='Confidence Interval')

        # Add titles and labels
        plt.title('Model - Actual vs Forecast (Filtered)')
        plt.xlabel('Date')
        plt.ylabel('Load')
        if ci_low_column and ci_upp_column:
            plt.ylim(filtered_data[ci_low_column].min() - 100, filtered_data[ci_upp_column].max() + 100)
        else:
            plt.ylim(filtered_data[main_column].min() - 200, filtered_data[main_column].max() + 200)
        plt.legend()
        plt.grid()

        # Xticks
        xticks = filtered_data[date_column][::2]
        plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
        plt.show()

### Prophet Functions
def calculate_aic(col_true, col_prediction, num_params=50):
    """Approximate the AIC of a forecast from its residuals, assuming a
    Gaussian likelihood.

    Parameters:
        col_true (pd.Series): observed values.
        col_prediction (pd.Series): predicted/fitted values.
        num_params (int): effective number of model parameters (k); default 50.

    Returns:
        float: 2*k - 2*log_likelihood computed over the indexes present
        (non-NaN) in BOTH series.
    """
    # Align on the common non-NaN indexes. The previous version built a
    # Python set here: sets are unordered and modern pandas rejects them as
    # indexers, so use Index.intersection (deterministic, order-preserving).
    common_index = col_true.dropna().index.intersection(col_prediction.dropna().index)
    col_true = col_true.loc[common_index]
    col_prediction = col_prediction.loc[common_index]

    # Calculate residuals and variance
    residuals = col_true.values - col_prediction.values
    rss = np.sum(residuals ** 2)
    sigma2 = np.var(residuals)  # biased (MLE) variance estimate

    # Number of parameters
    k = num_params

    # Number of observations
    n = len(residuals)

    # Gaussian log-likelihood; note this diverges if sigma2 == 0 (perfect fit)
    log_likelihood = -n / 2 * (np.log(2 * np.pi) + np.log(sigma2) + rss / (n * sigma2))

    # AIC
    aic = 2 * k - 2 * log_likelihood

    return aic

def prophet_model_train(model, data_model_train, data_model_test, main_column, plotting=True, *args, **kwargs):
    """Fit an already-configured Prophet model and assemble a metrics frame
    mirroring the layout used by the ARIMA helpers.

    Parameters:
        model: unfitted Prophet instance.
        data_model_train (pd.DataFrame): training data with Prophet's 'ds'/'y'
            columns plus 'Date_Graph'.
        data_model_test (pd.DataFrame): hold-out data with the same columns.
        main_column (str): output column name for the observed values.
        plotting (bool): if True, draw the Prophet component plots.
        *args, **kwargs: unused; kept for signature compatibility.

    Returns:
        tuple: (fitted model, data_check_metrics frame, raw forecast frame).
    """

    # Initialize and fit the Prophet model
    model.fit(data_model_train)
    
    # Extend the time range by one period per test row.
    # NOTE(review): freq='M' (month-end) is deprecated in newer pandas in
    # favour of 'ME' — verify against the installed version.
    future = model.make_future_dataframe(periods=data_model_test.shape[0], freq='M')
    forecast = model.predict(future)

    if plotting:      
        # Plot forecast components (trend, seasonality, etc.)
        fig = model.plot_components(forecast, figsize=(6, 6))
        plt.xticks(rotation=45)
        plt.show()

    # Rebuild a frame with the same column layout as the ARIMA helpers
    data_check_metrics = pd.concat([data_model_train, data_model_test])
    data_check_metrics['Date'] = data_check_metrics['ds']
    data_check_metrics[main_column] = data_check_metrics['y']
    data_check_metrics = data_check_metrics[['Date', 'Date_Graph', main_column]]
    data_check_metrics = data_check_metrics.reset_index()
    train_size = data_model_train.shape[0]
    test_size = data_model_test.shape[0]

    # Fitted values cover the training rows only
    fitted = forecast.iloc[0:train_size]['yhat']
    data_check_metrics[f"{main_column}_Fitted"] = pd.concat([fitted, pd.Series([None]*test_size)]).values
    
    # Predictions cover the test rows only
    predicted = forecast.iloc[train_size:data_check_metrics.shape[0]]['yhat']
    data_check_metrics[f"{main_column}_Prediction"] = pd.concat([pd.Series([None]*train_size), predicted]).values
    
    # NOTE(review): unlike the Prediction column, the CI columns span the
    # WHOLE range (train + test) — confirm this asymmetry is intentional.
    data_check_metrics[f"{main_column}_Prediction_CI_low"] = forecast['yhat_lower'].values
        
    data_check_metrics[f"{main_column}_Prediction_CI_upp"] = forecast['yhat_upper'].values
    
    return model, data_check_metrics, forecast

def calculate_model_metrics_prophet(data_check_metrics, main_column):
    """Report MAE/MSE/RMSE/MAPE for the train (fitted) and test (forecast)
    portions of a Prophet metrics frame. Prophet exposes no AIC, so a
    residual-based approximation (calculate_aic) is used for the train side.

    Parameters:
        data_check_metrics (pd.DataFrame): frame with main_column plus the
            f"{main_column}_Fitted" and f"{main_column}_Prediction" columns.
        main_column (str): name of the observed-values column.

    Returns:
        pd.DataFrame: metric names as rows, 'TRAIN'/'TEST' as columns.
    """
    observed = data_check_metrics[main_column]

    def _scores(estimate):
        # NaN rows drop out of the means automatically (pandas skipna), so
        # each column implicitly selects its own train/test subset.
        err = observed - estimate
        mae = np.mean(np.abs(err))
        mse = np.mean(err ** 2)
        mape = np.mean(np.abs(err / observed)) * 100
        return mae, mse, np.sqrt(mse), mape

    # TRAIN metrics from the in-sample fitted values
    mae_train, mse_train, rmse_train, mape_train = _scores(data_check_metrics[f"{main_column}_Fitted"])
    aic_train = calculate_aic(data_check_metrics[main_column], data_check_metrics[f"{main_column}_Fitted"])

    # TEST metrics from the out-of-sample predictions (no AIC there)
    mae_test, mse_test, rmse_test, mape_test = _scores(data_check_metrics[f"{main_column}_Prediction"])

    metrics_df = pd.DataFrame(
        {
            "TRAIN": [mae_train, mse_train, rmse_train, mape_train, aic_train],
            "TEST": [mae_test, mse_test, rmse_test, mape_test, None],
        },
        index=["MAE", "MSE", "RMSE", "MAPE (%)", "AIC"],
    )
    metrics_df.columns.name = "Dataset"

    # Print the metrics
    print("*** TRAIN ***")
    print(f"MAE: {mae_train:.2f}")
    print(f"MSE: {mse_train:.2f}")
    print(f"RMSE: {rmse_train:.2f}")
    print(f"MAPE: {mape_train:.2f}%")
    print(f"AIC: {aic_train}")

    print("\n*** TEST ***")
    print(f"MAE: {mae_test:.2f}")
    print(f"MSE: {mse_test:.2f}")
    print(f"RMSE: {rmse_test:.2f}")
    print(f"MAPE: {mape_test:.2f}%")

    return metrics_df

def calcualate_model_performance_prophet(model, data_check_metrics, main_column):
    """Residual diagnostics for the Prophet in-sample fit: Ljung-Box test,
    residuals-over-time plot, histogram, and ACF/PACF plots.

    NOTE: the name is misspelled ('calcualate') but kept for backward
    compatibility with existing callers.

    Parameters:
        model: fitted Prophet model (currently unused; kept for signature
            symmetry with the ARIMA variant).
        data_check_metrics (pd.DataFrame): frame holding main_column and
            f"{main_column}_Fitted". Mutated in place: a 'residuals' column
            is added.
        main_column (str): name of the observed-values column.

    Returns:
        None: prints the Ljung-Box table and shows the plots.
    """
    # Train size = number of rows that actually carry fitted values
    train_size = data_check_metrics[f"{main_column}_Fitted"].dropna().shape[0]
    # NOTE(review): assumes the fitted rows are the FIRST train_size rows of
    # the frame — holds for the train/test concat built upstream.
    real = data_check_metrics[main_column].iloc[0:train_size]
    fitted = data_check_metrics[f"{main_column}_Fitted"].iloc[0:train_size]
    
    # Calculate residuals (side effect: column added to the caller's frame)
    data_check_metrics['residuals'] = real - fitted
    residuals = data_check_metrics['residuals'].dropna()

    from statsmodels.stats.diagnostic import acorr_ljungbox

    # Ljung-Box test on residuals (H0: no autocorrelation up to lag 10)
    ljung_box_results = acorr_ljungbox(data_check_metrics['residuals'].dropna(), lags=[10], return_df=True)
    
    # Display results
    print('\n')
    print('Ljung-Box Test')
    print(ljung_box_results)

    # Residuals plot
    plt.figure(figsize=(10, 6))
    plt.plot(residuals, label='Residuals', color='blue')
    plt.axhline(0, color='red', linestyle='--', linewidth=1)
    plt.title('Residuals Over Time')
    plt.xlabel('Time')
    plt.ylabel('Residuals')
    plt.legend()
    plt.show()
    
    # Histogram of residuals
    plt.figure(figsize=(8, 5))
    plt.hist(residuals, bins=30, color='gray', edgecolor='black')
    plt.title('Histogram of Residuals')
    plt.xlabel('Residual Value')
    plt.ylabel('Frequency')
    plt.show()
    
    # ACF and PACF of residuals
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plot_acf(residuals, lags=10, ax=plt.gca(), title="ACF of Residuals")
    plt.subplot(1, 2, 2)
    plot_pacf(residuals, lags=10, ax=plt.gca(), title="PACF of Residuals")
    plt.tight_layout()
    plt.show()

def plot_model_results_prophet(data, main_column, date_column='Date_Graph', 
                         prediction_column=None, fitted_column=None,
                         ci_low_column=None, ci_upp_column=None, 
                         cutoff_date=None, vertical_line_date=None, 
                         vertical_line_ymin=None, vertical_line_ymax=None,
                      figsize=None):
    """
    Plot Prophet model results, including actual, fitted, and predicted values with confidence intervals.

    Parameters:
        data (pd.DataFrame): DataFrame containing the time series data.
            NOTE: mutated in place — date_column is converted to datetime.
        main_column (str): Column name for the actual values.
        date_column (str): Column name for the date. Default is 'Date_Graph'.
        prediction_column (str): Column name for predictions.
        fitted_column (str): Column name for fitted values.
        ci_low_column (str): Column name for lower confidence interval.
        ci_upp_column (str): Column name for upper confidence interval.
        cutoff_date (str): Date to filter the data for predictions. Format: 'YYYY-MM-DD'.
        vertical_line_date (str): Date for a vertical line. Format: 'YYYY-MM-DD'.
        vertical_line_ymin (float): Y-axis minimum for the vertical line.
        vertical_line_ymax (float): Y-axis maximum for the vertical line.
        figsize (tuple, optional): Figure size; defaults to (10, 6).

    Returns:
        None: Displays the plot(s).
    """
    if figsize is None:
        figsize = (10, 6)
    # Ensure date_column is in datetime format
    data[date_column] = pd.to_datetime(data[date_column], errors='coerce')

    # Plot the data
    plt.figure(figsize=figsize)
    plt.plot(data[date_column], data[main_column], label='Actual', color="orange", alpha=0.7)

    if fitted_column:
        plt.plot(data[date_column], data[fitted_column], label='Fitted', color='blue', linestyle="dashed", linewidth=3)

    if prediction_column:
        plt.plot(data[date_column], data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')

    if vertical_line_date:
        plt.vlines(x=pd.to_datetime(vertical_line_date), linestyles='--', color='orange', ymin=vertical_line_ymin, ymax=vertical_line_ymax)

    # Add titles and labels
    plt.title('Model - Actual vs Fitted and Forecast')
    plt.xlabel('Date')
    plt.ylabel('Load')
    plt.ylim(data[main_column].min() - 200, data[main_column].max() + 200)
    plt.legend()
    plt.grid()

    # Xticks
    xticks = data[date_column][::2]  # Select every second tick
    plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
    plt.show()

    if cutoff_date:
        # BUGFIX: previously hard-coded "2023-11-30" and ignored the
        # cutoff_date argument; now filters on cutoff_date, consistent with
        # plot_model_results. (.copy() avoids SettingWithCopyWarning below.)
        filtered_data = data.loc[data[date_column] > pd.to_datetime(cutoff_date)].copy()
        
        if prediction_column:
            filtered_data[prediction_column] = filtered_data[prediction_column].fillna(filtered_data[fitted_column])

        # Plot the filtered data
        plt.figure(figsize=figsize)
        plt.plot(filtered_data[date_column], filtered_data[main_column], label='Actual', color="orange", alpha=0.7)

        if prediction_column:
            plt.plot(filtered_data[date_column], filtered_data[prediction_column], label='Predicted', linestyle="dashdot", linewidth=3, color='red')

        if ci_low_column and ci_upp_column:
            # FIXME: hard-coded stitch — copies the 2023-12-01 fitted value
            # into the prediction column so the forecast line connects to the
            # actuals; should be derived from cutoff_date instead.
            filtered_data.loc[filtered_data["Date"] == pd.to_datetime("2023-12-01"), 
                f"{main_column}_Prediction"] = data.loc[data["Date"] == pd.to_datetime("2023-12-01"), f"{main_column}_Fitted"]
            # FIXME: magic fallback value 1500 fills remaining NaNs so
            # fill_between does not break — replace with a principled default.
            filtered_data = filtered_data.fillna(1500)
            #print(filtered_data.head())
            plt.fill_between(filtered_data[date_column], 
                             filtered_data[ci_low_column], 
                             filtered_data[ci_upp_column], 
                             color='pink', alpha=0.3, label='Confidence Interval')

        # Add titles and labels
        plt.title('Model - Actual vs Forecast (Filtered)')
        plt.xlabel('Date')
        plt.ylabel('Load')
        if ci_low_column and ci_upp_column:
            plt.ylim(filtered_data[ci_low_column].min() - 100, filtered_data[ci_upp_column].max() + 100)
        else:
            plt.ylim(filtered_data[main_column].min() - 200, filtered_data[main_column].max() + 200)
        plt.legend()
        plt.grid()

        # Xticks
        xticks = filtered_data[date_column][::2]
        plt.xticks(xticks, xticks.dt.strftime('%Y-%m-%d'), rotation=45)
        plt.show()
In [ ]: